In [1]:
pip install pandas openpyxl xlrd
Requirement already satisfied: pandas in c:\users\megha\anaconda3\lib\site-packages (2.1.4)
Requirement already satisfied: openpyxl in c:\users\megha\anaconda3\lib\site-packages (3.0.10)
Requirement already satisfied: xlrd in c:\users\megha\anaconda3\lib\site-packages (2.0.1)
Requirement already satisfied: numpy<2,>=1.23.2 in c:\users\megha\anaconda3\lib\site-packages (from pandas) (1.24.3)
Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\megha\anaconda3\lib\site-packages (from pandas) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in c:\users\megha\anaconda3\lib\site-packages (from pandas) (2023.3.post1)
Requirement already satisfied: tzdata>=2022.1 in c:\users\megha\anaconda3\lib\site-packages (from pandas) (2023.3)
Requirement already satisfied: et_xmlfile in c:\users\megha\anaconda3\lib\site-packages (from openpyxl) (1.1.0)
Requirement already satisfied: six>=1.5 in c:\users\megha\anaconda3\lib\site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)
Note: you may need to restart the kernel to use updated packages.
In [2]:
import pandas as pd
In [8]:
# Load the Online Retail transactions workbook into a DataFrame.
# NOTE(review): hardcoded absolute Windows path — consider a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_ori = pd.read_excel(r'C:\Users\megha\OneDrive\Desktop\Online Retail.xlsx')
df_ori.head()
Out[8]:
InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice CustomerID Country
0 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 2010-12-01 08:26:00 2.55 17850.0 United Kingdom
1 536365 71053 WHITE METAL LANTERN 6 2010-12-01 08:26:00 3.39 17850.0 United Kingdom
2 536365 84406B CREAM CUPID HEARTS COAT HANGER 8 2010-12-01 08:26:00 2.75 17850.0 United Kingdom
3 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6 2010-12-01 08:26:00 3.39 17850.0 United Kingdom
4 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6 2010-12-01 08:26:00 3.39 17850.0 United Kingdom
In [9]:
# Keep a pristine copy of the raw data so cleaning steps can be re-run later.
df_copy = df_ori.copy()
df_ori.head(3)
Out[9]:
InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice CustomerID Country
0 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 2010-12-01 08:26:00 2.55 17850.0 United Kingdom
1 536365 71053 WHITE METAL LANTERN 6 2010-12-01 08:26:00 3.39 17850.0 United Kingdom
2 536365 84406B CREAM CUPID HEARTS COAT HANGER 8 2010-12-01 08:26:00 2.75 17850.0 United Kingdom
In [18]:
# Plotting, preprocessing and clustering libraries used throughout the analysis.
import plotly.express as px
from datetime import timedelta
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import math
import numpy as np
sns.set_theme()  # apply seaborn's default plot styling globally
import warnings
# NOTE(review): blanket suppression hides deprecation warnings (e.g. from
# seaborn/sklearn) — consider filtering specific categories instead.
warnings.filterwarnings("ignore")
def describe(df, pred=None):
    """This function describes a dataframe's basic information."""
    obs = df.shape[0]
    numeric_cols = df.select_dtypes(include=np.number).columns  # Select only numeric columns
    types = df.dtypes
    counts = df.apply(lambda x: x.count())
    uniques = df.apply(lambda x: [x.unique()])

    distincts = df.apply(lambda x: x.unique().shape[0])
    missing_ratio = (df.isnull().sum() / obs) * 100
    skewness = df[numeric_cols].skew()  # Compute skewness for numeric columns only
    kurtosis = df[numeric_cols].kurt()  # Compute kurtosis for numeric columns only
    
    print('Data shape:', df.shape)
    
    if pred is None:
        cols = ['types', 'counts', 'distincts', 'missing ratio', 'uniques', 'skewness', 'kurtosis']
        output = pd.concat([types, counts, distincts, missing_ratio, uniques, skewness, kurtosis], axis=1, sort=True)
    else:
        corr = df.corr()[pred]
        corr_col = 'corr ' + pred
        cols = ['types', 'counts', 'distincts', 'missing ratio', 'uniques', 'skewness', 'kurtosis', corr_col]
        output = pd.concat([types, counts, distincts, missing_ratio, uniques, skewness, kurtosis, corr], axis=1, sort=True)
        output.columns = cols  # Move this line inside the else block
    
    dtypes = df.dtypes.value_counts()
    
    print('___________________________\nData types:\n\n', dtypes)
    print('___________________________')
    
    return output

# Profile every column of the raw data with the helper defined above.
details = describe(df_ori)
Data shape: (541909, 8)
___________________________
Data types:

 object            4
float64           2
int64             1
datetime64[ns]    1
Name: count, dtype: int64
___________________________
In [20]:
# Per-column null count plus its share of all rows, shown worst-first.
null_info = (
    df_ori.isnull()
          .sum()
          .to_frame('nulls')
          .assign(missing_ratio=lambda t: t['nulls'] / len(df_ori) * 100)
)
display(null_info.sort_values(by='nulls', ascending=False))
nulls missing_ratio
CustomerID 135080 24.926694
Description 1454 0.268311
InvoiceNo 0 0.000000
StockCode 0 0.000000
Quantity 0 0.000000
InvoiceDate 0 0.000000
UnitPrice 0 0.000000
Country 0 0.000000
In [22]:
# Summary statistics — note the negative Quantity/UnitPrice minima (anomalies).
df_ori.describe()
Out[22]:
Quantity InvoiceDate UnitPrice CustomerID
count 541909.000000 541909 541909.000000 406829.000000
mean 9.552250 2011-07-04 13:34:57.156386048 4.611114 15287.690570
min -80995.000000 2010-12-01 08:26:00 -11062.060000 12346.000000
25% 1.000000 2011-03-28 11:34:00 1.250000 13953.000000
50% 3.000000 2011-07-19 17:17:00 2.080000 15152.000000
75% 10.000000 2011-10-19 11:27:00 4.130000 16791.000000
max 80995.000000 2011-12-09 12:50:00 38970.000000 18287.000000
std 218.081158 NaN 96.759853 1713.600303
In [23]:
# Restore the untouched copy before filtering.
df_ori = df_copy.copy()

# Rows where Quantity or UnitPrice is non-positive are treated as anomalous.
bool_error = (df_ori['Quantity']<=0) | (df_ori['UnitPrice']<=0)

both_negative = int(((df_ori['Quantity'] < 0) & (df_ori['UnitPrice'] < 0)).sum())
n_errors = int(bool_error.sum())
print('With Quantity negative and UnitPrice negative:', both_negative)
print('With Quantity or UnitPrice negative or equals to zero:', n_errors)
print('Values errors by percentage:', str(n_errors / df_ori.shape[0] * 100)[:5], '%')
print('CustomerIDs with these anomalous values are:', df_ori.loc[bool_error, 'CustomerID'].unique())
With Quantity negative and UnitPrice negative: 0
With Quantity or UnitPrice negative or equals to zero: 11805
Values errors by percentage: 2.178 %
CustomerIDs with these anomalous values are: [14527. 15311. 17548. ... 12985. 15951. 16446.]
In [24]:
# Keep only rows with a known customer and valid Quantity/UnitPrice.
df_ori = df_ori[df_ori['CustomerID'].notna() & ~bool_error]
In [25]:
# Re-profile after dropping anomalous rows and missing CustomerIDs.
details = describe(df_ori)
Data shape: (397884, 8)
___________________________
Data types:

 object            4
float64           2
int64             1
datetime64[ns]    1
Name: count, dtype: int64
___________________________
In [32]:
# Distinct-value count per column, most varied first.
sorted_details = df_ori.nunique().sort_values(ascending=False)
display(sorted_details)
InvoiceNo      18532
InvoiceDate    17282
CustomerID      4338
Description     3877
StockCode       3665
UnitPrice        440
Quantity         301
Country           37
dtype: int64
In [30]:
# Calculate skewness for numerical columns
skewness = df_ori.select_dtypes(include=['int64', 'float64']).apply(lambda x: x.skew())

# Sort the skewness values in descending order
sorted_skewness = skewness.sort_values(ascending=False)

# Display the sorted skewness values
print(sorted_skewness)
Quantity      409.892972
UnitPrice     204.032727
CustomerID      0.025729
dtype: float64
In [33]:
# Summary statistics after cleaning: all quantities and prices now positive.
df_ori.describe()
Out[33]:
Quantity InvoiceDate UnitPrice CustomerID
count 397884.000000 397884 397884.000000 397884.000000
mean 12.988238 2011-07-10 23:41:23.511023360 3.116488 15294.423453
min 1.000000 2010-12-01 08:26:00 0.001000 12346.000000
25% 2.000000 2011-04-07 11:12:00 1.250000 13969.000000
50% 6.000000 2011-07-31 14:39:00 1.950000 15159.000000
75% 12.000000 2011-10-20 14:33:00 3.750000 16795.000000
max 80995.000000 2011-12-09 12:50:00 8142.750000 18287.000000
std 179.331775 NaN 22.097877 1713.141560
In [39]:
# Count how many distinct Descriptions each StockCode carries.
stock_desc = df_ori.groupby(['StockCode', 'Description']).count().reset_index()
stock_desc_count = stock_desc['StockCode'].value_counts().reset_index()
# BUG FIX: the original filtered on the StockCode value itself
# (`int(StockCode) > 1`, which is true for every numeric code) instead of
# the number of distinct descriptions. Filter on the 'count' column.
filtered_stock_desc_count = stock_desc_count[stock_desc_count['count'] > 1].head(2)

print(filtered_stock_desc_count)
  StockCode  count
0     23196      4
1     23236      4
In [40]:
# One StockCode (23196) maps to several slightly different Descriptions.
df_ori[df_ori['StockCode'] == 23196]['Description'].unique()
Out[40]:
array(['RETRO LEAVES MAGNETIC NOTEPAD',
       'RETO LEAVES MAGNETIC SHOPPING LIST',
       'LEAVES MAGNETIC  SHOPPING LIST', 'VINTAGE LEAF MAGNETIC NOTEPAD'],
      dtype=object)
In [41]:
# Canonicalize descriptions: map each StockCode to its first-seen Description.
code_to_desc = {code: descriptions[0]
                for code, descriptions in df_ori.groupby('StockCode')['Description'].unique().items()}
df_ori['Description'] = df_ori['StockCode'].map(code_to_desc)

# No missing CustomerIDs remain, so the column can be an integer again.
df_ori['CustomerID'] = df_ori['CustomerID'].astype('int32')
In [42]:
# Verify the remapping: 23196 now has a single canonical description.
df_ori[df_ori['StockCode'] == 23196]['Description'].unique()
Out[42]:
array(['RETRO LEAVES MAGNETIC NOTEPAD'], dtype=object)
In [44]:
# Re-check distinct counts after the description cleanup
# (Description dropped from 3877 to match the 3665 stock codes more closely).
sorted_details = df_ori.nunique().sort_values(ascending=False)
display(sorted_details)
InvoiceNo      18532
InvoiceDate    17282
CustomerID      4338
StockCode       3665
Description     3647
UnitPrice        440
Quantity         301
Country           37
dtype: int64

Analyze sales: we analyze sales with respect to some of the other variables.

Market representation and country

In [45]:
# Flag domestic (UK) vs international transactions and show the split.
df_ori['Internal'] = np.where(df_ori['Country'] == 'United Kingdom', 'Yes', 'No')
fig = px.pie(df_ori, names='Internal', title='Market representation',
             color_discrete_sequence=px.colors.qualitative.Pastel)
fig.show()

# Line-level revenue, then total revenue per country.
df_ori['Amount'] = df_ori['UnitPrice'] * df_ori['Quantity']
temp = (df_ori.groupby('Country')['Amount']
              .sum()
              .reset_index()
              .sort_values(by=['Amount'], ascending=False))
fig = px.bar(temp, x='Country', y='Amount', title='Amount sales by country',
             color_discrete_sequence=px.colors.qualitative.Pastel)
fig.show()

Top Customers

In [46]:
# Total amount spent per customer, largest first.
temp = df_ori.groupby('CustomerID')[['Amount']].sum().sort_values(by=['Amount'], ascending=False)
ratio_sales_inplot = str(temp['Amount'][:50].sum() / temp['Amount'].sum() * 100)[:5] + ' %'
fig = px.bar(temp[:50].reset_index(), x='CustomerID', y='Amount',
             title='50 Best Customers by amount (' + ratio_sales_inplot + ' of total amount of sales)',
             color_discrete_sequence=px.colors.qualitative.Pastel)
fig.update_layout(xaxis_type='category')
fig.show()

ratio_sales_inplot = str(temp['Amount'][:10].sum() / temp['Amount'].sum() * 100)[:5] + ' %'
fig = px.bar(temp[:10].reset_index(), x='CustomerID', y='Amount',
             title='10 Best Customers by amount (' + ratio_sales_inplot + ' of total amount of sales)',
             color_discrete_sequence=px.colors.qualitative.Pastel)
fig.update_layout(xaxis_type='category')
fig.show()

# Number of purchases (line items) per customer, most frequent first.
# (Removed a dead bare `temp` expression that produced no output mid-cell.)
temp = df_ori.groupby('CustomerID')[['Amount']].count().sort_values(by=['Amount'], ascending=False)
ratio_sales_inplot = str(temp['Amount'][:10].sum() / temp['Amount'].sum() * 100)[:4] + ' %'
# TYPO FIX: original title read "frecuency".
fig = px.bar(temp[:10].reset_index(), x='CustomerID', y='Amount',
             title='10 Best Customers by frequency of sales (' + ratio_sales_inplot + ' of total frequency of sales)',
             color_discrete_sequence=px.colors.qualitative.Pastel)
fig.update_layout(xaxis_type='category')
fig.show()

Top products

In [48]:
# Revenue per product, top 10.
temp_amount = (df_ori.groupby(['StockCode', 'Description'])[['Amount']]
                     .sum()
                     .sort_values(by=['Amount'], ascending=False)
                     .reset_index())
ratio_sales_inplot_amount = str(temp_amount['Amount'][:10].sum() / temp_amount['Amount'].sum() * 100)[:5] + ' %'
fig_amount = px.bar(temp_amount[:10].reset_index(), x='Description', y='Amount',
                    title='10 best products by amount (' + ratio_sales_inplot_amount + ' of total amount of sales)',
                    color_discrete_sequence=px.colors.qualitative.Pastel)
fig_amount.update_layout(xaxis_type='category')
fig_amount.update_traces(marker_color='rgb(246,207,113)')
fig_amount.show()

# Number of order lines per product, top 10.
# BUG FIX: the original never sorted temp_count, so the "10 best products by
# frequency" chart showed the first 10 stock codes, not the most frequent ones.
temp_count = (df_ori.groupby(['StockCode', 'Description'])
                    .size()
                    .reset_index(name='Frequency')
                    .sort_values(by=['Frequency'], ascending=False)
                    .reset_index(drop=True))
ratio_sales_inplot_count = str(temp_count['Frequency'][:10].sum() / temp_count['Frequency'].sum() * 100)[:5] + ' %'
fig_count = px.bar(temp_count[:10].reset_index(), x='Description', y='Frequency',
                   title='10 best products by frequency (' + ratio_sales_inplot_count + ' of total frequency of sales)',
                   color_discrete_sequence=px.colors.qualitative.Pastel)
fig_count.update_layout(xaxis_type='category')
fig_count.update_traces(marker_color='rgb(246,207,113)')
fig_count.show()

Customer Segmentation with RFM: we are going to use the Recency, Frequency, Monetary (RFM) model. As stated on its Wikipedia page, RFM stands for three dimensions:

Recency – How recently did the customer purchase? Frequency – How often do they purchase? Monetary Value – How much do they spend? This analysis must be careful with the time window, because it can be biased or inaccurate if we try to span an extremely long duration.

In [49]:
# Reference date for recency: one day after the last invoice in the data.
last_purchase = df_ori['InvoiceDate'].max()
snapshot_date = last_purchase + timedelta(days=1)
print(snapshot_date)
2011-12-10 12:50:00
In [50]:
# Build one row per customer with the three RFM features.
data_process = df_ori.groupby(['CustomerID']).agg({
        'InvoiceDate': lambda x: (snapshot_date - x.max()).days,  # Recency: days since last purchase
        'InvoiceNo': 'count',                                     # Frequency: number of line items
        'Amount': 'sum'})                                         # Monetary: total spend

# Rename to the conventional RFM feature names.
data_process.columns = ['Recency', 'Frequency', 'MonetaryValue']

# (Removed a redundant `data_process = data_process` self-assignment.)
data_process[:3]
Out[50]:
Recency Frequency MonetaryValue
CustomerID
12346 326 1 77183.60
12347 2 182 4310.00
12348 75 31 1797.24
In [51]:
# Distribution of each raw RFM feature, side by side.
fig, axes = plt.subplots(1, 3, figsize=(22, 5))
for i, feature in enumerate(list(data_process.columns)):
    # sns.distplot is deprecated (removed in seaborn 0.14); histplot with
    # kde=True and stat='density' is the documented replacement.
    sns.histplot(data_process[feature], kde=True, stat='density', ax=axes[i])
No description has been provided for this image

The data is skewed. Using a log transformation, we can improve the quality of the data for future analysis.

In [52]:
# Natural-log transform each RFM feature to reduce the heavy skew.
for col in ['Recency', 'Frequency', 'MonetaryValue']:
    data_process[col + '_log'] = np.log(data_process[col])
data_process[:3]
Out[52]:
Recency Frequency MonetaryValue Recency_log Frequency_log MonetaryValue_log
CustomerID
12346 326 1 77183.60 5.786897 0.000000 11.253942
12347 2 182 4310.00 0.693147 5.204007 8.368693
12348 75 31 1797.24 4.317488 3.433987 7.494007
In [53]:
# Distribution of the log-transformed features — much closer to normal.
fig, axes = plt.subplots(1, 3, figsize=(22, 5))

for i, feature in enumerate(list(data_process.columns[3:])):
    # sns.distplot is deprecated (removed in seaborn 0.14); histplot with
    # kde=True and stat='density' is the documented replacement.
    sns.histplot(data_process[feature], kde=True, stat='density', ax=axes[i])
No description has been provided for this image
In [54]:
# Scale every feature into [0, 1] so no single one dominates the clustering.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(data_process)
data_process_normalized = pd.DataFrame(scaled_values,
                                       columns=['n_' + col for col in data_process.columns])
data_process_normalized.describe()
Out[54]:
n_Recency n_Frequency n_MonetaryValue n_Recency_log n_Frequency_log n_MonetaryValue_log
count 4338.000000 4338.000000 4338.000000 4338.000000 4338.000000 4338.000000
mean 0.245406 0.011563 0.007318 0.635951 0.410325 0.469546
std 0.268135 0.029159 0.032081 0.241793 0.147873 0.112364
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.045576 0.002039 0.001084 0.487888 0.315929 0.392678
50% 0.134048 0.005098 0.002394 0.663683 0.414097 0.462699
75% 0.378016 0.012618 0.005917 0.836532 0.513518 0.543051
max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000

Now we are going to plot the inertia for each cluster from 1 to 15

In [55]:
# Elbow plot: within-cluster SSE (inertia) for k = 1..15 on the log features.
SSE, max_k = [], 15
list_input = list(data_process_normalized.columns[3:])
for k in range(1, max_k + 1):
    km = KMeans(n_clusters=k, random_state=42).fit(data_process_normalized[list_input])
    SSE.append(km.inertia_)

fig = go.Figure(data=go.Scatter(x=list(range(1, max_k + 1)), y=SSE))
fig.update_traces(marker_size=14)
fig.show()

Using the elbow heuristic, we decide to use k = 5 for our model.

In [56]:
# Fit the final model with k = 5 and attach the cluster label to each customer.
model = KMeans(n_clusters=5, random_state=42).fit(data_process_normalized[list_input])
data_process_normalized['cluster'] = model.predict(data_process_normalized[list_input])
x_axis, y_axis, z_axis = list_input
fig = px.scatter_3d(data_process_normalized, x=x_axis, y=y_axis, z=z_axis,
                    color='cluster')
fig.show()
In [57]:
# Peek at the normalized features with their assigned cluster label.
data_process_normalized[:3]
Out[57]:
n_Recency n_Frequency n_MonetaryValue n_Recency_log n_Frequency_log n_MonetaryValue_log cluster
0 0.871314 0.000000 0.275443 0.976814 0.000000 0.885101 0
1 0.002681 0.023069 0.015368 0.117002 0.580294 0.627984 2
2 0.198391 0.003824 0.006401 0.728782 0.382920 0.550037 4
In [58]:
# Per-cluster profile (mean/min/max) of the normalized RFM features.
rfm_cols = ['n_Recency', 'n_Frequency', 'n_MonetaryValue']
data_process_normalized.groupby('cluster')[rfm_cols].agg(['mean', 'min', 'max'])
Out[58]:
n_Recency n_Frequency n_MonetaryValue
mean min max mean min max mean min max
cluster
0 0.619670 0.168901 1.000000 0.001886 0.000000 0.010706 0.001545 0.000000 0.275443
1 0.095597 0.005362 0.217158 0.002591 0.000000 0.012108 0.001461 0.000011 0.024072
2 0.005788 0.000000 0.018767 0.034290 0.000000 1.000000 0.026576 0.000415 1.000000
3 0.053130 0.016086 0.152815 0.020948 0.000892 0.208514 0.011788 0.000766 0.445788
4 0.283725 0.096515 0.994638 0.009454 0.000765 0.069080 0.004926 0.000469 0.158923

We have clustered our customers based on their recency, frequency, and monetary value. Here's a breakdown of the clusters based on the mean, minimum, and maximum values for each cluster:

Cluster 0: These customers have a relatively high recency, frequency, and monetary value compared to other clusters. Cluster 1: Customers in this cluster have a lower recency, frequency, and monetary value compared to Cluster 0 but higher than some other clusters. Cluster 2: These customers have a very low recency but a high frequency and monetary value. Cluster 3: Customers in this cluster have moderate recency, frequency, and monetary value. Cluster 4: This cluster represents customers with high recency but lower frequency and monetary value.

Here's an interpretation of the clusters based on the dataset:

Cluster 0: These are our most valuable customers. They have made recent purchases, buy frequently, and spend a significant amount per transaction. They are highly engaged and contribute significantly to your revenue.

Cluster 1: Customers in this cluster are somewhat engaged but not as much as those in Cluster 0. They have made purchases recently, buy moderately often, and spend moderately per transaction.

Cluster 2: This cluster represents customers who may not have made recent purchases but are highly loyal in terms of frequency and monetary value. They may be occasional buyers but tend to spend a lot when they do.

Cluster 3: These customers have moderate recency, frequency, and monetary value. They are neither highly engaged nor disengaged but fall somewhere in between.

Cluster 4: These are our least engaged customers. They have not made recent purchases, buy infrequently, and spend relatively little per transaction. They may need re-engagement strategies to increase their activity.

Based on this interpretation, one can tailor marketing, retention, and engagement strategies for each cluster. For example, for Cluster 0, you might focus on loyalty programs or personalized offers to maintain their high engagement. For Cluster 4, you might consider reactivation campaigns or targeted promotions to encourage them to make more purchases.